# Code Chunk #1: Basic data set exploration, modification of column classes, basic tables and barplots.
##### Data Exploration: understanding titanic data --------------------
# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history.
# On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with
# an iceberg, killing 1502 out of 2224 passengers and crew.
# This sensational tragedy shocked the international community and led to better
# safety regulations for ships. One of the reasons that the shipwreck led to such
# loss of life was that there were not enough lifeboats for the passengers and crew.
# Although there was some element of luck involved in surviving the sinking,
# some groups of people such as women, children, and the upper-class
# were more likely to survive than others.
# VARIABLE DESCRIPTIONS:
# PassengerId Unique passenger identifier
# Survived Survival (0 = No; 1 = Yes)
# Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd). Pclass is a proxy for socio-economic status (SES):
# 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower
# Name Name
# Sex Sex
# Age Age in years (fractional if Age is less than one (1); if the Age is estimated, it is in the form xx.5)
# SibSp Number of Siblings/Spouses Aboard
# Parch Number of Parents/Children Aboard
# Ticket Ticket Number
# Fare Passenger Fare
# Cabin Cabin
# Embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
### ---------------------------------------------------------------------------
### Set up, data import and inspections
# Load packages after they have been installed.
library(psych)
library(scatterplot3d)
library(lattice)
library(MASS)
# Import a csv file
# Build the path to the training data and load it. file.path() is the base R
# helper for joining path components; gettextf() is meant for translatable
# messages. Text columns are kept as character vectors so that only the
# nominal ones are converted to factors later on.
dir <- "~/R/IS_6482/MA1"
inputfile <- file.path(dir, "titanic.train.csv")
titanic <- read.csv(file = inputfile, stringsAsFactors = FALSE)
# Examine the overall data frame
# str() shows the number of observations, and the number, names, types and some values of columns
## str(titanic)
# You can retrieve and save the number of rows and number of columns of a data frame
# Number of observations (rows) in the data frame
nrow(titanic)
## [1] 891
# dim() returns c(rows, columns); keep each count in its own variable
row <- dim(titanic)[1L]
row
## [1] 891
col <- dim(titanic)[2L]
col
## [1] 12
# Show the head and tail rows of a data frame
# n is spelled out here; 6 is the value head() uses when n is omitted
head(titanic, n = 6L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
# The same first six rows, this time selected by row index
titanic[seq_len(6L), ]
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
# The first row only, obtained with head() ...
head(titanic, n = 1L)
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 1 1 0 3 Braund, Mr. Owen Harris male 22 1 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.25 S
# ... and with direct row indexing
titanic[1L, ]
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 1 1 0 3 Braund, Mr. Owen Harris male 22 1 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.25 S
# First ten observations
head(titanic, n = 10L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## 7 7 0 1
## 8 8 0 3
## 9 9 1 3
## 10 10 1 2
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## 7 McCarthy, Mr. Timothy J male 54 0
## 8 Palsson, Master. Gosta Leonard male 2 3
## 9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0
## 10 Nasser, Mrs. Nicholas (Adele Achem) female 14 1
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
## 7 0 17463 51.8625 E46 S
## 8 1 349909 21.0750 S
## 9 2 347742 11.1333 S
## 10 0 237736 30.0708 C
# Last ten observations
tail(titanic, n = 10L)
## PassengerId Survived Pclass Name
## 882 882 0 3 Markun, Mr. Johann
## 883 883 0 3 Dahlberg, Miss. Gerda Ulrika
## 884 884 0 2 Banfield, Mr. Frederick James
## 885 885 0 3 Sutehall, Mr. Henry Jr
## 886 886 0 3 Rice, Mrs. William (Margaret Norton)
## 887 887 0 2 Montvila, Rev. Juozas
## 888 888 1 1 Graham, Miss. Margaret Edith
## 889 889 0 3 Johnston, Miss. Catherine Helen "Carrie"
## 890 890 1 1 Behr, Mr. Karl Howell
## 891 891 0 3 Dooley, Mr. Patrick
## Sex Age SibSp Parch Ticket Fare Cabin Embarked
## 882 male 33 0 0 349257 7.8958 S
## 883 female 22 0 0 7552 10.5167 S
## 884 male 28 0 0 C.A./SOTON 34068 10.5000 S
## 885 male 25 0 0 SOTON/OQ 392076 7.0500 S
## 886 female 39 0 5 382652 29.1250 Q
## 887 male 27 0 0 211536 13.0000 S
## 888 female 19 0 0 112053 30.0000 B42 S
## 889 female NA 1 2 W./C. 6607 23.4500 S
## 890 male 26 0 0 111369 30.0000 C148 C
## 891 male 32 0 0 370376 7.7500 Q
# summary() shows the mean and the five-number statistics indicating the spread of each column's values
## summary(titanic)
# Remove unique identifiers from further analysis as they are not interesting without additional feature extractions
# Drop the identifier-like columns by name rather than by position, so this
# step keeps working if the column order of the input file ever changes.
titanic <- titanic[setdiff(names(titanic), c("PassengerId", "Name", "Ticket"))]
# Change Survived and other nominal variables to factors
nominal_vars <- c("Survived", "Sex", "Pclass", "Cabin", "Embarked")
titanic[nominal_vars] <- lapply(titanic[nominal_vars], factor)
# Confirm the remaining columns and their new classes
str(titanic)
## 'data.frame': 891 obs. of 9 variables:
## $ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## summary(titanic)
# Remove observations with missing Age values.
# This approach to handling missing data has an obvious disadvantage:
# the resulting model cannot be applied to data with missing Age values.
# Keeping observations with missing Age values requires careful imputation of the missing ages.
# The various missing data imputation methods are beyond the knowledge required for this tutorial.
# Missing values in Age can cause problems. Test the following commands when you have time.
# var(titanic$Age)
# mean(titanic$Age)
# var(titanic[-which(is.na(titanic$Age)), ]$Age)
# sd(titanic[-which(is.na(titanic$Age)), ]$Age)
# Keep only the observations whose Age is present, i.e. is.na(titanic$Age) is FALSE.
# Logical indexing is used instead of -which(is.na(...)): when no Age is missing,
# which() returns integer(0) and titanic[-integer(0), ] would drop every row.
titanic <- titanic[!is.na(titanic$Age), ]
## summary(titanic)
# On the other hand, keeping missing factor levels might be able to lead to meaningful models
# Empty level names of the Cabin and Embarked factors will cause problems in some analysis.
# Other missing factor value imputation remains a good option beyond the scope of this tutorial.
# levels() and sort() with factors
## str(titanic$Cabin)
## titanic$Cabin
# str() truncates the display of factor levels. levels() shows all of the factor levels of a factor variable
## levels(titanic$Cabin)
# It is useful to sort the levels based on how many observations contain a factor level.
# Remember table() that counts how many observations contain a factor level.
# Use sort to sort factor levels by the count of observations containing a level
# You can use sort inside barplot()
# Count the observations that carry each Cabin level
table(titanic[["Cabin"]])
##
## A10 A14 A16
## 529 1 0 1
## A19 A20 A23 A24
## 0 1 1 1
## A26 A31 A32 A34
## 1 1 0 1
## A36 A5 A6 A7
## 1 1 1 1
## B101 B102 B18 B19
## 1 0 2 1
## B20 B22 B28 B3
## 2 2 2 1
## B30 B35 B37 B38
## 1 2 1 1
## B39 B4 B41 B42
## 1 1 1 1
## B49 B5 B50 B51 B53 B55
## 2 2 1 2
## B57 B59 B63 B66 B58 B60 B69 B71
## 2 2 1 1
## B73 B77 B78 B79
## 1 2 0 1
## B80 B82 B84 B86 B94
## 1 1 1 1
## B96 B98 C101 C103 C104
## 4 1 1 1
## C106 C110 C111 C118
## 0 1 1 1
## C123 C124 C125 C126
## 2 1 2 1
## C128 C148 C2 C22 C26
## 0 1 2 3
## C23 C25 C27 C30 C32 C45
## 4 1 1 1
## C46 C47 C49 C50
## 1 0 1 1
## C52 C54 C62 C64 C65
## 1 1 1 2
## C68 C7 C70 C78
## 2 1 1 2
## C82 C83 C85 C86
## 1 2 1 1
## C87 C90 C91 C92
## 1 1 1 1
## C93 C95 C99 D
## 2 0 1 3
## D10 D12 D11 D15 D17
## 1 1 1 2
## D19 D20 D21 D26
## 1 2 0 2
## D28 D30 D33 D35
## 1 1 2 2
## D36 D37 D45 D46
## 2 1 0 1
## D47 D48 D49 D50
## 1 1 1 1
## D56 D6 D7 D9
## 1 1 1 1
## E10 E101 E12 E121
## 1 2 1 2
## E17 E24 E25 E31
## 1 2 2 1
## E33 E34 E36 E38
## 1 1 1 1
## E40 E44 E46 E49
## 1 2 1 1
## E50 E58 E63 E67
## 1 1 1 2
## E68 E77 E8 F2
## 1 1 2 3
## F33 F38 F4 F E69
## 3 0 2 0
## F G63 F G73 G6 T
## 1 2 4 1
# The same counts, ordered from the most to the least frequent level
sort(table(titanic[["Cabin"]]), decreasing = TRUE)
##
## B96 B98 C23 C25 C27 G6
## 529 4 4 4
## C22 C26 D F2 F33
## 3 3 3 3
## B18 B20 B22 B28
## 2 2 2 2
## B35 B49 B5 B51 B53 B55
## 2 2 2 2
## B57 B59 B63 B66 B58 B60 B77 C123
## 2 2 2 2
## C125 C2 C65 C68
## 2 2 2 2
## C78 C83 C93 D17
## 2 2 2 2
## D20 D26 D33 D35
## 2 2 2 2
## D36 E101 E121 E24
## 2 2 2 2
## E25 E44 E67 E8
## 2 2 2 2
## F4 F G73 A10 A16
## 2 2 1 1
## A20 A23 A24 A26
## 1 1 1 1
## A31 A34 A36 A5
## 1 1 1 1
## A6 A7 B101 B19
## 1 1 1 1
## B3 B30 B37 B38
## 1 1 1 1
## B39 B4 B41 B42
## 1 1 1 1
## B50 B69 B71 B73
## 1 1 1 1
## B79 B80 B82 B84 B86
## 1 1 1 1
## B94 C101 C103 C104
## 1 1 1 1
## C110 C111 C118 C124
## 1 1 1 1
## C126 C148 C30 C32
## 1 1 1 1
## C45 C46 C49 C50
## 1 1 1 1
## C52 C54 C62 C64 C7
## 1 1 1 1
## C70 C82 C85 C86
## 1 1 1 1
## C87 C90 C91 C92
## 1 1 1 1
## C99 D10 D12 D11 D15
## 1 1 1 1
## D19 D28 D30 D37
## 1 1 1 1
## D46 D47 D48 D49
## 1 1 1 1
## D50 D56 D6 D7
## 1 1 1 1
## D9 E10 E12 E17
## 1 1 1 1
## E31 E33 E34 E36
## 1 1 1 1
## E38 E40 E46 E49
## 1 1 1 1
## E50 E58 E63 E68
## 1 1 1 1
## E77 F G63 T A14
## 1 1 1 0
## A19 A32 B102 B78
## 0 0 0 0
## C106 C128 C47 C95
## 0 0 0 0
## D21 D45 F38 F E69
## 0 0 0 0
# Bar charts of the Cabin counts, largest first and then smallest first
barplot(sort(table(titanic$Cabin), decreasing = TRUE))
barplot(sort(table(titanic$Cabin), decreasing = FALSE))
# Fixing empty character level names for Cabin and Embarked.
# The empty level is selected by value rather than by position: relabelling
# levels(...)[1] would silently rename a real level if "" were not present.
## levels(titanic$Cabin)[1]
levels(titanic$Cabin)[levels(titanic$Cabin) == ""] <- "missing"
levels(titanic$Embarked)
## [1] "" "C" "Q" "S"
barplot(sort(table(titanic$Embarked), decreasing = FALSE))
levels(titanic$Embarked)[1]
## [1] ""
levels(titanic$Embarked)[levels(titanic$Embarked) == ""] <- "missing"
# Check the relabelled factors together with the other columns
summary(titanic)
## Survived Pclass Sex Age SibSp
## 0:424 1:186 female:261 Min. : 0.42 Min. :0.0000
## 1:290 2:173 male :453 1st Qu.:20.12 1st Qu.:0.0000
## 3:355 Median :28.00 Median :0.0000
## Mean :29.70 Mean :0.5126
## 3rd Qu.:38.00 3rd Qu.:1.0000
## Max. :80.00 Max. :5.0000
##
## Parch Fare Cabin Embarked
## Min. :0.0000 Min. : 0.00 missing :529 missing: 2
## 1st Qu.:0.0000 1st Qu.: 8.05 B96 B98 : 4 C :130
## Median :0.0000 Median : 15.74 C23 C25 C27: 4 Q : 28
## Mean :0.4314 Mean : 34.69 G6 : 4 S :554
## 3rd Qu.:1.0000 3rd Qu.: 33.38 C22 C26 : 3
## Max. :6.0000 Max. :512.33 D : 3
## (Other) :167
# Code Chunk #2: Numerical variable summaries including basic functions, quantiles and boxplots.
### understanding a single variable: numerical variables
# Show summary of one or more columns
# For a factor column, summary() reports the count of each level
summary(titanic[["Pclass"]])
## 1 2 3
## 186 173 355
# Several columns can be summarised at once by selecting them by name
summary(titanic[, c("Sex", "Age")])
## Sex Age
## female:261 Min. : 0.42
## male :453 1st Qu.:20.12
## Median :28.00
## Mean :29.70
## 3rd Qu.:38.00
## Max. :80.00
# Centre and extent of a numeric variable: mean, median and range
with(titanic, mean(Age))
## [1] 29.69912
with(titanic, median(Age))
## [1] 28
with(titanic, range(Age))
## [1] 0.42 80.00
# Width of the range: the largest Age minus the smallest Age
max.Age <- max(titanic$Age)
min.Age <- min(titanic$Age)
min.Age
## [1] 0.42
max.Age
## [1] 80
range.diff.Age <- diff(c(min.Age, max.Age))
range.diff.Age
## [1] 79.58
# Min-max normalization rescales the first observation's Age to a value between zero and 1
titanic$Age[[1]]
## [1] 22
(titanic$Age[[1]] - min.Age) / range.diff.Age
## [1] 0.2711737
# diff() calculates differences between an attribute's values in the referenced record and that in the record following it
# diff(titanic$Age)
# ?diff
# titanic$Age
# Five-number summary of Age: the quartile probabilities are written out explicitly
quantile(titanic$Age, probs = seq(0, 1, by = 0.25))
## 0% 25% 50% 75% 100%
## 0.420 20.125 28.000 38.000 80.000
# The 1st and the 99th percentile
quantile(titanic$Age, probs = c(0.01, 0.99))
## 1% 99%
## 1.00 65.87
# Quintiles (steps of 20%) and deciles (steps of 10%)
quantile(titanic$Age, probs = seq(0, 1, by = 0.20))
## 0% 20% 40% 60% 80% 100%
## 0.42 19.00 25.00 31.80 41.00 80.00
quantile(titanic$Age, probs = seq(0, 1, by = 0.10))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 0.42 14.00 19.00 22.00 25.00 28.00 31.80 36.00 41.00 50.00 80.00
# Boxplot of the same variable
boxplot(
  titanic$Age,
  ylab = "Age",
  main = "Boxplot of Age in the titanic data set"
)
# The same quantile exercise for another variable, SibSp
quantile(titanic$SibSp, probs = seq(0, 1, by = 0.25))
## 0% 25% 50% 75% 100%
## 0 0 0 1 5
# The 1st and the 99th percentile
quantile(titanic$SibSp, probs = c(0.01, 0.99))
## 1% 99%
## 0 4
# Quintiles (steps of 20%) and deciles (steps of 10%)
quantile(titanic$SibSp, probs = seq(0, 1, by = 0.20))
## 0% 20% 40% 60% 80% 100%
## 0 0 0 0 1 5
quantile(titanic$SibSp, probs = seq(0, 1, by = 0.10))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 0 0 0 0 0 0 0 1 1 1 5
# Boxplot of the same variable
boxplot(
  titanic$SibSp,
  ylab = "Sibsp",
  main = "Boxplot of Sibsp in the titanic data set"
)
# IQR = 3rdQuartile - 1stQuartile
# maxline = 3rdQuartile + 1.5*IQR
# minline = 1stQuartile - 1.5*IQR
# one definition of outliers: values > maxline or values < minline are drawn as circles or dots
# Index plot of a numeric variable: one point per observation, in row order.
# It becomes too busy for a large number of observations
plot(titanic$SibSp)
# Histograms of the numeric variables, each with its own title and x-axis label
hist(
  titanic$SibSp,
  xlab = "Sibsp",
  main = "Histogram of Sibsp in the titanic data set"
)
### For a right skewed distribution, the mean is typically greater than the median
hist(
  titanic$Age,
  xlab = "Age",
  main = "Histogram of Age in the titanic data set"
)
hist(
  titanic$Fare,
  xlab = "Fare",
  main = "Histogram of Fare in the titanic data set"
)
hist(
  titanic$Parch,
  xlab = "Parch",
  main = "Histogram of Parch in the titanic data set"
)
# Search for and understand the meaning of skewed, bimodal and multimodal continuous distributions
# Variance and standard deviation of a numeric variable
with(titanic, var(Age))
## [1] 211.0191
with(titanic, sd(Age))
## [1] 14.5265
with(titanic, var(SibSp))
## [1] 0.8644973
with(titanic, sd(SibSp))
## [1] 0.9297835
# Code Chunk #3: Exploration of factor variables including dotplots, barplots and a table function with an error.
### Exploring factor variables
# Inspecting a factor: its class, level counts, number of levels and structure
inherits(titanic$Survived, "factor")
## [1] TRUE
with(titanic, summary(Survived))
## 0 1
## 424 290
length(levels(titanic$Survived))
## [1] 2
with(titanic, str(Survived))
## Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
# Once converted to character the values are no longer a factor,
# and summary() only reports length, class and mode
inherits(as.character(titanic$Survived), "factor")
## [1] FALSE
with(titanic, summary(as.character(Survived)))
## Length Class Mode
## 714 character character
# How many observations carry each factor value?
# plot() accepts the factor itself ...
plot(titanic$Survived)
plot(
  titanic$Survived,
  xlab = "Survived",
  main = "Plot of Survived in the titanic data set"
)
# ... whereas barplot() needs the counts, so tabulate the factor first
Survived.table <- table(titanic[["Survived"]])
Survived.table
##
## 0 1
## 424 290
str(Survived.table)
## 'table' int [1:2(1d)] 424 290
## - attr(*, "dimnames")=List of 1
## ..$ : chr [1:2] "0" "1"
barplot(
  Survived.table,
  xlab = "Survived",
  main = "Plot of Survived in the titanic data set"
)
# remember the difference between the input data structures to plot() and barplot()
# Compute table proportions: prop.table() divides each count by the table total
# Run prop.table(titanic$Survived) to see the error in this command
prop.table(Survived.table)
##
## 0 1
## 0.5938375 0.4061625
Survived.prop <- prop.table(table(titanic$Survived))
Survived.prop
##
## 0 1
## 0.5938375 0.4061625
# Round decimals for a single result
round(Survived.prop, digits = 2)
##
## 0 1
## 0.59 0.41
# options(digits = 2) instead changes how many significant digits are printed
# from here on, so later output in this script is affected as well
options(digits = 2)
# Survived.prop already holds proportions, so it is printed directly
# instead of being passed through prop.table() a second time
Survived.prop
##
## 0 1
## 0.59 0.41
# Remember to get help from the Help pane using "?"
# Code Chunk #4: Exploring relationships between variables, scatter plots, boxplots, 3D scatterplots and parallel coordinate plots.
### Understand relationships of multiple variables
# Tools used below: cor, boxplot, 2D scatter plot - plot, 3D scatter plot
# Scatter plot of two numeric variables, first from a two-column data frame
# and then from the two vectors
plot(titanic[, c("SibSp", "Parch")])
plot(x = titanic$SibSp, y = titanic$Parch)
# Generate correlation coefficients of two numeric variables in a 2x2 matrix
# cor(X,Y) lies between -1 and 1. zero means no correlation. 1 or -1 indicates full correlation
# positive value means positive correlation and negative values mean negative relationships
# Examine the components in the formulation for correlation coefficients
# cor(X,Y) = cov(X,Y)/(sd(X)*sd(Y))
# cov(X,Y) = E[(X-E(X))*(Y-E(Y))]
# On a two-column data frame, cov() and var() both return the covariance matrix
cov(titanic[, c("SibSp", "Parch")])
## SibSp Parch
## SibSp 0.86 0.30
## Parch 0.30 0.73
var(titanic[, c("SibSp", "Parch")])
## SibSp Parch
## SibSp 0.86 0.30
## Parch 0.30 0.73
# Selecting one column with [, name] yields a vector, so the result is a single number
var(titanic[, "SibSp"])
## [1] 0.86
sd(titanic[, "SibSp"])
## [1] 0.93
# Selecting with [name] keeps a one-column data frame, so the result is a 1x1 matrix
var(titanic["SibSp"])
## SibSp
## SibSp 0.86
# Column 2 (Pclass) is a factor. Calling sd() on a factor was deprecated
# (it produced a warning) and is an error in current R releases, so the
# integer level codes are extracted explicitly with as.numeric() first.
sd(as.numeric(titanic[, 2]))
## [1] 0.84
# Three equivalent ways of selecting SibSp and Parch for a correlation matrix:
# by name, by position as a list-style subset, and by position as a column subset
cor(titanic[c("SibSp", "Parch")])
## SibSp Parch
## SibSp 1.00 0.38
## Parch 0.38 1.00
cor(titanic[c(5L, 6L)])
## SibSp Parch
## SibSp 1.00 0.38
## Parch 0.38 1.00
cor(titanic[, c(5L, 6L)])
## SibSp Parch
## SibSp 1.00 0.38
## Parch 0.38 1.00
# Correlation matrix of all numeric variables
cor(titanic[c("Age", "SibSp", "Parch", "Fare")])
## Age SibSp Parch Fare
## Age 1.000 -0.31 -0.19 0.096
## SibSp -0.308 1.00 0.38 0.138
## Parch -0.189 0.38 1.00 0.205
## Fare 0.096 0.14 0.21 1.000
# Scatter plot matrix of the numeric variables, then a pairs.panels() view
# that also includes the factor columns
pairs(titanic[c("Age", "SibSp", "Parch", "Fare")])
## pairs.panels(titanic[-1])
## pairs.panels(titanic)
pairs.panels(
  titanic[, c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked")]
)
## Relationships between numeric variables and a factor
# Each formula puts the numeric variable on the left of ~ and the grouping
# factor on the right, giving one box per Survived level
boxplot(Age ~ Survived, data = titanic)
boxplot(Parch ~ Survived, data = titanic)
boxplot(SibSp ~ Survived, data = titanic)
boxplot(Fare ~ Survived, data = titanic)
# The aggregate function
# We can use the aggregate command to aggregate a numeric feature by a categorical one.
# The aggregate function has three parameters
# 1. The numeric value, e.g. sales, to be aggregated to find out, e.g., total of sales,
# average of sales, number of sales (i.e. orders).
# 2. The set of categories, product_category and sales_region, on which you wish
# to aggregate
# 3.The aggregation function (e.g., sum, mean, length) that you wish to use
# summary() of SibSp within each Survived group; the aggregation function is
# passed by name as FUN
aggregate(SibSp ~ Survived, data = titanic, FUN = summary)
## Survived SibSp.Min. SibSp.1st Qu. SibSp.Median SibSp.Mean SibSp.3rd Qu.
## 1 0 0.00 0.00 0.00 0.53 1.00
## 2 1 0.00 0.00 0.00 0.49 1.00
## SibSp.Max.
## 1 5.00
## 2 4.00
# The same breakdown for Parch
aggregate(Parch ~ Survived, data = titanic, FUN = summary)
## Survived Parch.Min. Parch.1st Qu. Parch.Median Parch.Mean Parch.3rd Qu.
## 1 0 0.00 0.00 0.00 0.37 0.00
## 2 1 0.00 0.00 0.00 0.53 1.00
## Parch.Max.
## 1 6.00
## 2 5.00
# Scatter plots of numeric values and factor values
plot(titanic$Survived)
plot(titanic$Age)
plot(titanic$Age, titanic$Survived)
# Colour and plotting symbol both encode Survived through its level codes (1 and 2)
plot(titanic$SibSp, titanic$Parch, col = titanic$Survived,
     pch = as.numeric(titanic$Survived))
# Inside with() the columns are visible by name, so the titanic$ prefix is
# dropped; the axes are then labelled "SibSp" and "Parch".
with(titanic, plot(SibSp, Parch, col = Survived, pch = as.numeric(Survived)))
with(titanic, plot(SibSp, Parch, col = Survived, pch = as.numeric(Survived),
                   main = "2d scatter plot of titanic data",
                   sub = "SibSp vs Parch"))
# palette() lists the colours that the integer colour codes map to
palette()
## [1] "black" "red" "green3" "blue" "cyan" "magenta" "yellow"
## [8] "gray"
# Legend for the last plot: colours/symbols 1 and 2 match the two Survived levels
legend("topright", legend = levels(titanic$Survived), col = 1:2, cex = 0.8, pch = 1:2)
# 3D scatter plots; the plotting symbol encodes Survived and a legend is
# added after each plot
scatterplot3d(
  titanic$Sex, titanic$Age, titanic$SibSp,
  pch = as.numeric(titanic$Survived),
  main = "3D scatter plot of titanic data"
)
legend("topright", legend = levels(titanic$Survived), pch = 1:2, cex = 0.8)
scatterplot3d(
  titanic$Fare, titanic$Pclass, titanic$Sex,
  pch = as.numeric(titanic$Survived),
  main = "3D scatter plot of titanic data"
)
legend("topright", legend = levels(titanic$Survived), pch = 1:2, cex = 0.8)
# Female passengers only; which() keeps just the rows where the comparison is TRUE
titanicf <- titanic[which(titanic$Sex == "female"), ]
summary(titanicf)
## Survived Pclass Sex Age SibSp Parch
## 0: 64 1: 85 female:261 Min. : 1 Min. :0.0 Min. :0.0
## 1:197 2: 74 male : 0 1st Qu.:18 1st Qu.:0.0 1st Qu.:0.0
## 3:102 Median :27 Median :0.0 Median :0.0
## Mean :28 Mean :0.6 Mean :0.7
## 3rd Qu.:37 3rd Qu.:1.0 3rd Qu.:1.0
## Max. :63 Max. :5.0 Max. :6.0
##
## Fare Cabin Embarked
## Min. : 7 missing:171 missing: 2
## 1st Qu.: 13 G6 : 4 C : 61
## Median : 26 F33 : 3 Q : 12
## Mean : 48 B18 : 2 S :186
## 3rd Qu.: 58 B28 : 2
## Max. :512 B35 : 2
## (Other): 77
# Male passengers only; which() keeps just the rows where the comparison is TRUE
titanicm <- titanic[which(titanic$Sex == "male"), ]
summary(titanicm)
## Survived Pclass Sex Age SibSp Parch
## 0:360 1:101 female: 0 Min. : 0 Min. :0.0 Min. :0.0
## 1: 93 2: 99 male :453 1st Qu.:21 1st Qu.:0.0 1st Qu.:0.0
## 3:253 Median :29 Median :0.0 Median :0.0
## Mean :31 Mean :0.4 Mean :0.3
## 3rd Qu.:39 3rd Qu.:1.0 3rd Qu.:0.0
## Max. :80 Max. :5.0 Max. :5.0
##
## Fare Cabin Embarked
## Min. : 0 missing :358 missing: 0
## 1st Qu.: 8 F2 : 3 C : 69
## Median : 13 B51 B53 B55: 2 Q : 16
## Mean : 27 B96 B98 : 2 S :368
## 3rd Qu.: 28 C23 C25 C27: 2
## Max. :512 D26 : 2
## (Other) : 84
# Males only: SibSp against Age, with the plotting symbol showing Survived
plot(titanicm$Age, titanicm$SibSp, pch = as.numeric(titanicm$Survived),
     main = "2D scatter plot of males' SibSp and Age")
# The legend is built from the subset that was actually plotted (titanicm)
legend("topright", legend = levels(titanicm$Survived), cex = 0.8, pch = 1:2)
# Females only: Pclass against Fare, with the plotting symbol showing Survived
plot(titanicf$Fare, titanicf$Pclass, pch = as.numeric(titanicf$Survived),
     main = "2D scatter plot of titanic females' Pclass vs Fare")
legend("topright", legend = levels(titanicf$Survived), cex = 0.8, pch = 1:2)
# Parallel plots and parallel coordinate plots show relationships between
# numeric variables and factors. The formulas below condition on Survived.
# Required libraries - lattice and MASS
parallelplot(~ titanic[-1] | Survived, data = titanic, var.label = TRUE)
parallelplot(~ titanic[c("Age", "SibSp", "Parch", "Fare")] | Survived,
             data = titanic, var.label = TRUE)
parallelplot(~ titanic[c("Age", "SibSp")] | Survived,
             data = titanic, var.label = TRUE)
parallelplot(~ titanic[c("Age", "Parch")] | Survived,
             data = titanic, var.label = TRUE)
parallelplot(~ titanic[c("Parch", "Fare")] | Survived,
             data = titanic, var.label = TRUE)
parallelplot(~ titanic[c("SibSp", "Parch")] | Survived,
             data = titanic, var.label = TRUE)
palette()
## [1] "black" "red" "green3" "blue" "cyan" "magenta" "yellow"
## [8] "gray"
# legend() that works for other plots cannot display legend in the parallelplot() above
# Parallel coordinate plots of selected numeric columns, coloured by Survived
parcoord(titanic[c("Age", "SibSp", "Parch", "Fare")],
         col = titanic$Survived, var.label = TRUE)
parcoord(titanic[c("Age", "SibSp")],
         col = titanic$Survived, var.label = TRUE)
parcoord(titanic[c("Age", "Parch")],
         col = titanic$Survived, var.label = TRUE)
parcoord(titanic[c("Age", "Fare")],
         col = titanic$Survived, var.label = TRUE)
parcoord(titanic[c("SibSp", "Parch")],
         col = titanic$Survived, var.label = TRUE)
palette()
## [1] "black" "red" "green3" "blue" "cyan" "magenta" "yellow"
## [8] "gray"
# Unlike parallelplot(), a base-graphics legend can be added to this plot
legend("topright", legend = levels(titanic$Survived), col = 1:2, pch = 1:2, cex = 0.8)
##### end of Titanic Data Exploration Tutorial